R Markdown

library(nnet)
library(rattle)
## Warning: package 'rattle' was built under R version 3.4.4
## Rattle: A free graphical interface for data science with R.
## Version 5.2.0 Copyright (c) 2006-2018 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(rpart)
## Warning: package 'rpart' was built under R version 3.4.3
library(MASS)
library(stargazer)
## Warning: package 'stargazer' was built under R version 3.4.4
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(DT)
## Warning: package 'DT' was built under R version 3.4.3
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.4
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.4.4

Read and check the train and test data

train<-read.csv("~/Desktop/Programming for analytics/week 6/Prudential/train.csv")
test<-read.csv("~/Desktop/Programming for analytics/week 6/Prudential/test.csv")
head(train)
##   Id Product_Info_1 Product_Info_2 Product_Info_3 Product_Info_4
## 1  2              1             D3             10     0.07692308
## 2  5              1             A1             26     0.07692308
## 3  6              1             E1             26     0.07692308
## 4  7              1             D4             10     0.48717949
## 5  8              1             D2             26     0.23076923
## 6 10              1             D2             26     0.23076923
##   Product_Info_5 Product_Info_6 Product_Info_7    Ins_Age        Ht
## 1              2              1              1 0.64179104 0.5818182
## 2              2              3              1 0.05970149 0.6000000
## 3              2              3              1 0.02985075 0.7454545
## 4              2              3              1 0.16417910 0.6727273
## 5              2              3              1 0.41791045 0.6545455
## 6              3              1              1 0.50746269 0.8363636
##          Wt       BMI Employment_Info_1 Employment_Info_2
## 1 0.1485356 0.3230080             0.028                12
## 2 0.1317992 0.2722877             0.000                 1
## 3 0.2887029 0.4287804             0.030                 9
## 4 0.2050209 0.3524377             0.042                 9
## 5 0.2343096 0.4240456             0.027                 9
## 6 0.2991632 0.3648867             0.325                15
##   Employment_Info_3 Employment_Info_4 Employment_Info_5 Employment_Info_6
## 1                 1                 0                 3                NA
## 2                 3                 0                 2            0.0018
## 3                 1                 0                 2            0.0300
## 4                 1                 0                 3            0.2000
## 5                 1                 0                 2            0.0500
## 6                 1                 0                 2            1.0000
##   InsuredInfo_1 InsuredInfo_2 InsuredInfo_3 InsuredInfo_4 InsuredInfo_5
## 1             1             2             6             3             1
## 2             1             2             6             3             1
## 3             1             2             8             3             1
## 4             2             2             8             3             1
## 5             1             2             6             3             1
## 6             1             2             8             3             1
##   InsuredInfo_6 InsuredInfo_7 Insurance_History_1 Insurance_History_2
## 1             2             1                   1                   1
## 2             2             1                   2                   1
## 3             1             1                   2                   1
## 4             2             1                   2                   1
## 5             2             1                   2                   1
## 6             1             1                   2                   1
##   Insurance_History_3 Insurance_History_4 Insurance_History_5
## 1                   3                   1         0.000666667
## 2                   3                   1         0.000133333
## 3                   1                   3                  NA
## 4                   1                   3                  NA
## 5                   1                   3                  NA
## 6                   3                   2         0.005000000
##   Insurance_History_7 Insurance_History_8 Insurance_History_9
## 1                   1                   1                   2
## 2                   1                   3                   2
## 3                   3                   2                   3
## 4                   3                   2                   3
## 5                   3                   2                   3
## 6                   1                   3                   2
##   Family_Hist_1 Family_Hist_2 Family_Hist_3 Family_Hist_4 Family_Hist_5
## 1             2            NA     0.5980392            NA     0.5267857
## 2             2     0.1884058            NA    0.08450704            NA
## 3             3     0.3043478            NA    0.22535211            NA
## 4             3     0.4202899            NA    0.35211268            NA
## 5             2     0.4637681            NA    0.40845070            NA
## 6             2            NA     0.2941176    0.50704225            NA
##   Medical_History_1 Medical_History_2 Medical_History_3 Medical_History_4
## 1                 4               112                 2                 1
## 2                 5               412                 2                 1
## 3                10                 3                 2                 2
## 4                 0               350                 2                 2
## 5                NA               162                 2                 2
## 6                 6               491                 2                 2
##   Medical_History_5 Medical_History_6 Medical_History_7 Medical_History_8
## 1                 1                 3                 2                 2
## 2                 1                 3                 2                 2
## 3                 1                 3                 2                 2
## 4                 1                 3                 2                 2
## 5                 1                 3                 2                 2
## 6                 1                 3                 2                 2
##   Medical_History_9 Medical_History_10 Medical_History_11
## 1                 1                 NA                  3
## 2                 1                 NA                  3
## 3                 2                 NA                  3
## 4                 2                 NA                  3
## 5                 2                 NA                  3
## 6                 2                 NA                  3
##   Medical_History_12 Medical_History_13 Medical_History_14
## 1                  2                  3                  3
## 2                  2                  3                  3
## 3                  2                  3                  3
## 4                  2                  3                  3
## 5                  2                  3                  3
## 6                  2                  3                  3
##   Medical_History_15 Medical_History_16 Medical_History_17
## 1                240                  3                  3
## 2                  0                  1                  3
## 3                 NA                  1                  3
## 4                 NA                  1                  3
## 5                 NA                  1                  3
## 6                 NA                  1                  3
##   Medical_History_18 Medical_History_19 Medical_History_20
## 1                  1                  1                  2
## 2                  1                  1                  2
## 3                  1                  1                  2
## 4                  1                  1                  2
## 5                  1                  1                  2
## 6                  2                  1                  2
##   Medical_History_21 Medical_History_22 Medical_History_23
## 1                  1                  2                  3
## 2                  1                  2                  3
## 3                  1                  2                  3
## 4                  2                  2                  3
## 5                  1                  2                  3
## 6                  2                  2                  3
##   Medical_History_24 Medical_History_25 Medical_History_26
## 1                 NA                  1                  3
## 2                 NA                  1                  3
## 3                 NA                  2                  2
## 4                 NA                  1                  3
## 5                 NA                  2                  2
## 6                 NA                  1                  3
##   Medical_History_27 Medical_History_28 Medical_History_29
## 1                  3                  1                  3
## 2                  3                  1                  3
## 3                  3                  1                  3
## 4                  3                  1                  3
## 5                  3                  1                  3
## 6                  3                  1                  3
##   Medical_History_30 Medical_History_31 Medical_History_32
## 1                  2                  3                 NA
## 2                  2                  3                 NA
## 3                  2                  3                 NA
## 4                  2                  3                 NA
## 5                  2                  3                 NA
## 6                  2                  3                 NA
##   Medical_History_33 Medical_History_34 Medical_History_35
## 1                  1                  3                  1
## 2                  3                  1                  1
## 3                  3                  3                  1
## 4                  3                  3                  1
## 5                  3                  3                  1
## 6                  3                  1                  1
##   Medical_History_36 Medical_History_37 Medical_History_38
## 1                  2                  2                  1
## 2                  2                  2                  1
## 3                  3                  2                  1
## 4                  2                  2                  1
## 5                  3                  2                  1
## 6                  2                  2                  1
##   Medical_History_39 Medical_History_40 Medical_History_41
## 1                  3                  3                  3
## 2                  3                  3                  1
## 3                  3                  3                  1
## 4                  3                  3                  1
## 5                  3                  3                  1
## 6                  3                  3                  3
##   Medical_Keyword_1 Medical_Keyword_2 Medical_Keyword_3 Medical_Keyword_4
## 1                 0                 0                 0                 0
## 2                 0                 0                 0                 0
## 3                 0                 0                 0                 0
## 4                 0                 0                 0                 0
## 5                 0                 0                 0                 0
## 6                 0                 0                 0                 0
##   Medical_Keyword_5 Medical_Keyword_6 Medical_Keyword_7 Medical_Keyword_8
## 1                 0                 0                 0                 0
## 2                 0                 0                 0                 0
## 3                 0                 0                 0                 0
## 4                 0                 0                 0                 0
## 5                 0                 0                 0                 0
## 6                 0                 0                 0                 0
##   Medical_Keyword_9 Medical_Keyword_10 Medical_Keyword_11
## 1                 0                  0                  0
## 2                 0                  0                  0
## 3                 0                  0                  0
## 4                 0                  0                  0
## 5                 0                  0                  0
## 6                 0                  0                  0
##   Medical_Keyword_12 Medical_Keyword_13 Medical_Keyword_14
## 1                  0                  0                  0
## 2                  0                  0                  0
## 3                  0                  0                  0
## 4                  0                  0                  0
## 5                  0                  0                  0
## 6                  0                  0                  0
##   Medical_Keyword_15 Medical_Keyword_16 Medical_Keyword_17
## 1                  0                  0                  0
## 2                  0                  0                  0
## 3                  0                  0                  0
## 4                  0                  0                  0
## 5                  0                  0                  0
## 6                  0                  0                  0
##   Medical_Keyword_18 Medical_Keyword_19 Medical_Keyword_20
## 1                  0                  0                  0
## 2                  0                  0                  0
## 3                  0                  0                  0
## 4                  0                  0                  0
## 5                  0                  0                  0
## 6                  0                  0                  0
##   Medical_Keyword_21 Medical_Keyword_22 Medical_Keyword_23
## 1                  0                  0                  0
## 2                  0                  0                  0
## 3                  0                  0                  0
## 4                  0                  0                  0
## 5                  0                  0                  0
## 6                  0                  1                  0
##   Medical_Keyword_24 Medical_Keyword_25 Medical_Keyword_26
## 1                  0                  0                  0
## 2                  0                  0                  0
## 3                  0                  0                  0
## 4                  0                  0                  0
## 5                  0                  0                  0
## 6                  0                  0                  0
##   Medical_Keyword_27 Medical_Keyword_28 Medical_Keyword_29
## 1                  0                  0                  0
## 2                  0                  0                  0
## 3                  0                  0                  0
## 4                  0                  0                  0
## 5                  0                  0                  0
## 6                  0                  0                  0
##   Medical_Keyword_30 Medical_Keyword_31 Medical_Keyword_32
## 1                  0                  0                  0
## 2                  0                  0                  0
## 3                  0                  0                  0
## 4                  0                  0                  1
## 5                  0                  0                  0
## 6                  0                  0                  0
##   Medical_Keyword_33 Medical_Keyword_34 Medical_Keyword_35
## 1                  0                  0                  0
## 2                  0                  0                  0
## 3                  0                  0                  0
## 4                  0                  0                  0
## 5                  0                  0                  0
## 6                  0                  1                  0
##   Medical_Keyword_36 Medical_Keyword_37 Medical_Keyword_38
## 1                  0                  0                  0
## 2                  0                  0                  0
## 3                  0                  0                  0
## 4                  0                  0                  0
## 5                  0                  0                  0
## 6                  0                  0                  0
##   Medical_Keyword_39 Medical_Keyword_40 Medical_Keyword_41
## 1                  0                  0                  0
## 2                  0                  0                  0
## 3                  0                  0                  0
## 4                  0                  0                  0
## 5                  0                  0                  0
## 6                  0                  0                  0
##   Medical_Keyword_42 Medical_Keyword_43 Medical_Keyword_44
## 1                  0                  0                  0
## 2                  0                  0                  0
## 3                  0                  0                  0
## 4                  0                  0                  0
## 5                  0                  0                  0
## 6                  0                  0                  0
##   Medical_Keyword_45 Medical_Keyword_46 Medical_Keyword_47
## 1                  0                  0                  0
## 2                  0                  0                  0
## 3                  0                  0                  0
## 4                  0                  0                  0
## 5                  0                  0                  0
## 6                  0                  0                  0
##   Medical_Keyword_48 Response
## 1                  0        8
## 2                  0        4
## 3                  0        8
## 4                  0        8
## 5                  0        8
## 6                  0        8
head(test)
##   Id Product_Info_1 Product_Info_2 Product_Info_3 Product_Info_4
## 1  1              1             D3             26     0.48717949
## 2  3              1             A2             26     0.07692308
## 3  4              1             D3             26     0.14466667
## 4  9              1             A1             26     0.15170872
## 5 12              1             A1             26     0.07692308
## 6 13              1             D3             26     0.23076923
##   Product_Info_5 Product_Info_6 Product_Info_7   Ins_Age        Ht
## 1              2              3              1 0.6119403 0.7818182
## 2              2              3              1 0.6268657 0.7272727
## 3              2              3              1 0.5820896 0.7090909
## 4              2              1              1 0.5223881 0.6545455
## 5              2              3              1 0.2985075 0.6727273
## 6              2              3              1 0.5671642 0.8181818
##          Wt       BMI Employment_Info_1 Employment_Info_2
## 1 0.3389121 0.4722616             0.150                 3
## 2 0.3117155 0.4849840             0.000                 1
## 3 0.3200837 0.5191032             0.143                 9
## 4 0.2677824 0.4869621             0.210                 9
## 5 0.2468619 0.4287182             0.085                 9
## 6 0.2991632 0.3797544             0.075                 9
##   Employment_Info_3 Employment_Info_4 Employment_Info_5 Employment_Info_6
## 1                 1              0.00                 2              0.50
## 2                 3              0.07                 2              0.20
## 3                 1              0.00                 2              0.45
## 4                 1              0.00                 2              1.00
## 5                 1              0.00                 2              0.20
## 6                 1              0.00                 2              0.40
##   InsuredInfo_1 InsuredInfo_2 InsuredInfo_3 InsuredInfo_4 InsuredInfo_5
## 1             2             2            11             3             1
## 2             1             2             8             3             1
## 3             1             2             3             3             1
## 4             2             2             3             3             1
## 5             1             2             8             3             1
## 6             1             2             8             3             1
##   InsuredInfo_6 InsuredInfo_7 Insurance_History_1 Insurance_History_2
## 1             1             1                   2                   1
## 2             1             1                   1                   1
## 3             1             1                   2                   1
## 4             1             1                   1                   1
## 5             2             1                   2                   1
## 6             1             1                   2                   1
##   Insurance_History_3 Insurance_History_4 Insurance_History_5
## 1                   1                   3                  NA
## 2                   3                   1         0.001666667
## 3                   1                   3                  NA
## 4                   3                   1         0.000666667
## 5                   1                   3                  NA
## 6                   1                   3                  NA
##   Insurance_History_7 Insurance_History_8 Insurance_History_9
## 1                   3                   2                   3
## 2                   1                   1                   2
## 3                   3                   2                   3
## 4                   2                   1                   2
## 5                   3                   2                   3
## 6                   3                   2                   3
##   Family_Hist_1 Family_Hist_2 Family_Hist_3 Family_Hist_4 Family_Hist_5
## 1             3            NA     0.6274510     0.7605634            NA
## 2             2            NA     0.5294118     0.7464789            NA
## 3             3     0.6666667            NA     0.6619718            NA
## 4             2            NA     0.6862745     0.6760563            NA
## 5             2     0.4492754            NA     0.3802817            NA
## 6             3            NA     0.6470588            NA     0.5535714
##   Medical_History_1 Medical_History_2 Medical_History_3 Medical_History_4
## 1                 2                16                 2                 2
## 2                 5               261                 3                 1
## 3                 3               132                 2                 1
## 4                NA               162                 3                 2
## 5                18               181                 3                 1
## 6                 4               335                 2                 2
##   Medical_History_5 Medical_History_6 Medical_History_7 Medical_History_8
## 1                 1                 3                 1                 2
## 2                 1                 3                 2                 2
## 3                 1                 3                 2                 2
## 4                 1                 1                 2                 3
## 5                 1                 3                 2                 2
## 6                 1                 3                 2                 2
##   Medical_History_9 Medical_History_10 Medical_History_11
## 1                 2                 NA                  3
## 2                 1                 NA                  3
## 3                 2                 NA                  3
## 4                 2                 NA                  3
## 5                 2                 NA                  3
## 6                 2                 NA                  3
##   Medical_History_12 Medical_History_13 Medical_History_14
## 1                  2                  1                  3
## 2                  2                  3                  3
## 3                  2                  3                  3
## 4                  2                  3                  3
## 5                  2                  3                  3
## 6                  2                  3                  3
##   Medical_History_15 Medical_History_16 Medical_History_17
## 1                 NA                  1                  2
## 2                110                  3                  3
## 3                240                  1                  3
## 4                 NA                  1                  3
## 5                188                  1                  3
## 6                 NA                  1                  3
##   Medical_History_18 Medical_History_19 Medical_History_20
## 1                  1                  1                  2
## 2                  1                  1                  2
## 3                  1                  1                  2
## 4                  1                  1                  2
## 5                  1                  1                  2
## 6                  1                  1                  2
##   Medical_History_21 Medical_History_22 Medical_History_23
## 1                  1                  2                  1
## 2                  1                  2                  3
## 3                  1                  2                  3
## 4                  2                  2                  3
## 5                  1                  2                  1
## 6                  1                  2                  3
##   Medical_History_24 Medical_History_25 Medical_History_26
## 1                 NA                  2                  2
## 2                 NA                  2                  2
## 3                 NA                  2                  2
## 4                 NA                  1                  3
## 5                 NA                  1                  3
## 6                 NA                  2                  2
##   Medical_History_27 Medical_History_28 Medical_History_29
## 1                  1                  1                  3
## 2                  3                  1                  3
## 3                  3                  1                  1
## 4                  3                  2                  3
## 5                  3                  1                  1
## 6                  3                  1                  3
##   Medical_History_30 Medical_History_31 Medical_History_32
## 1                  2                  3                 NA
## 2                  2                  3                 NA
## 3                  2                  3                 NA
## 4                  2                  3                 NA
## 5                  2                  3                 NA
## 6                  2                  3                 NA
##   Medical_History_33 Medical_History_34 Medical_History_35
## 1                  3                  3                  1
## 2                  3                  3                  1
## 3                  1                  3                  1
## 4                  3                  1                  1
## 5                  3                  3                  1
## 6                  3                  3                  1
##   Medical_History_36 Medical_History_37 Medical_History_38
## 1                  3                  2                  1
## 2                  3                  2                  1
## 3                  3                  2                  1
## 4                  2                  2                  1
## 5                  2                  2                  1
## 6                  3                  2                  1
##   Medical_History_39 Medical_History_40 Medical_History_41
## 1                  3                  3                  3
## 2                  3                  3                  1
## 3                  3                  3                  3
## 4                  3                  3                  3
## 5                  3                  3                  3
## 6                  3                  3                  1
##   Medical_Keyword_1 Medical_Keyword_2 Medical_Keyword_3 Medical_Keyword_4
## 1                 0                 0                 0                 0
## 2                 0                 0                 0                 0
## 3                 0                 0                 0                 0
## 4                 0                 0                 0                 0
## 5                 0                 0                 0                 0
## 6                 0                 0                 0                 0
##   Medical_Keyword_5 Medical_Keyword_6 Medical_Keyword_7 Medical_Keyword_8
## 1                 0                 0                 0                 0
## 2                 0                 0                 0                 0
## 3                 0                 0                 0                 0
## 4                 0                 0                 0                 0
## 5                 0                 0                 0                 0
## 6                 0                 0                 0                 0
##   Medical_Keyword_9 Medical_Keyword_10 Medical_Keyword_11
## 1                 0                  0                  0
## 2                 0                  0                  0
## 3                 0                  1                  0
## 4                 0                  0                  0
## 5                 0                  0                  0
## 6                 0                  0                  0
##   Medical_Keyword_12 Medical_Keyword_13 Medical_Keyword_14
## 1                  0                  0                  0
## 2                  0                  0                  0
## 3                  0                  0                  0
## 4                  0                  0                  0
## 5                  0                  0                  0
## 6                  0                  0                  0
##   Medical_Keyword_15 Medical_Keyword_16 Medical_Keyword_17
## 1                  1                  0                  0
## 2                  0                  0                  0
## 3                  0                  0                  0
## 4                  0                  0                  0
## 5                  0                  0                  0
## 6                  0                  0                  0
##   Medical_Keyword_18 Medical_Keyword_19 Medical_Keyword_20
## 1                  0                  1                  0
## 2                  0                  0                  0
## 3                  0                  0                  0
## 4                  0                  0                  0
## 5                  0                  0                  0
## 6                  0                  0                  0
##   Medical_Keyword_21 Medical_Keyword_22 Medical_Keyword_23
## 1                  0                  0                  0
## 2                  0                  0                  0
## 3                  0                  0                  1
## 4                  0                  0                  0
## 5                  0                  0                  0
## 6                  0                  0                  0
##   Medical_Keyword_24 Medical_Keyword_25 Medical_Keyword_26
## 1                  0                  1                  0
## 2                  0                  0                  0
## 3                  0                  1                  0
## 4                  0                  0                  0
## 5                  0                  1                  0
## 6                  0                  0                  0
##   Medical_Keyword_27 Medical_Keyword_28 Medical_Keyword_29
## 1                  0                  0                  0
## 2                  0                  0                  0
## 3                  0                  0                  0
## 4                  0                  0                  0
## 5                  0                  0                  0
## 6                  0                  0                  0
##   Medical_Keyword_30 Medical_Keyword_31 Medical_Keyword_32
## 1                  0                  0                  0
## 2                  0                  0                  0
## 3                  0                  0                  0
## 4                  0                  0                  0
## 5                  0                  0                  0
## 6                  0                  0                  0
##   Medical_Keyword_33 Medical_Keyword_34 Medical_Keyword_35
## 1                  0                  0                  0
## 2                  0                  0                  0
## 3                  0                  0                  0
## 4                  0                  0                  0
## 5                  0                  0                  0
## 6                  0                  0                  0
##   Medical_Keyword_36 Medical_Keyword_37 Medical_Keyword_38
## 1                  0                  0                  0
## 2                  0                  0                  0
## 3                  0                  0                  0
## 4                  0                  1                  0
## 5                  0                  0                  0
## 6                  0                  0                  0
##   Medical_Keyword_39 Medical_Keyword_40 Medical_Keyword_41
## 1                  0                  0                  0
## 2                  0                  0                  0
## 3                  0                  0                  0
## 4                  0                  0                  0
## 5                  0                  0                  0
## 6                  0                  0                  0
##   Medical_Keyword_42 Medical_Keyword_43 Medical_Keyword_44
## 1                  0                  0                  0
## 2                  0                  0                  0
## 3                  0                  0                  0
## 4                  0                  0                  0
## 5                  0                  0                  0
## 6                  0                  0                  0
##   Medical_Keyword_45 Medical_Keyword_46 Medical_Keyword_47
## 1                  0                  0                  0
## 2                  0                  0                  0
## 3                  0                  0                  0
## 4                  0                  0                  1
## 5                  0                  0                  0
## 6                  0                  0                  0
##   Medical_Keyword_48
## 1                  0
## 2                  0
## 3                  0
## 4                  1
## 5                  0
## 6                  0
str (train)
## 'data.frame':    59381 obs. of  128 variables:
##  $ Id                 : int  2 5 6 7 8 10 11 14 15 16 ...
##  $ Product_Info_1     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Product_Info_2     : Factor w/ 19 levels "A1","A2","A3",..: 17 1 19 18 16 16 8 16 17 19 ...
##  $ Product_Info_3     : int  10 26 26 10 26 26 10 26 26 21 ...
##  $ Product_Info_4     : num  0.0769 0.0769 0.0769 0.4872 0.2308 ...
##  $ Product_Info_5     : int  2 2 2 2 2 3 2 2 2 2 ...
##  $ Product_Info_6     : int  1 3 3 3 3 1 3 3 3 3 ...
##  $ Product_Info_7     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Ins_Age            : num  0.6418 0.0597 0.0299 0.1642 0.4179 ...
##  $ Ht                 : num  0.582 0.6 0.745 0.673 0.655 ...
##  $ Wt                 : num  0.149 0.132 0.289 0.205 0.234 ...
##  $ BMI                : num  0.323 0.272 0.429 0.352 0.424 ...
##  $ Employment_Info_1  : num  0.028 0 0.03 0.042 0.027 0.325 0.11 0.12 0.165 0.025 ...
##  $ Employment_Info_2  : int  12 1 9 9 9 15 1 12 9 1 ...
##  $ Employment_Info_3  : int  1 3 1 1 1 1 3 1 1 3 ...
##  $ Employment_Info_4  : num  0 0 0 0 0 0 NA 0 0 0 ...
##  $ Employment_Info_5  : int  3 2 2 3 2 2 3 2 2 3 ...
##  $ Employment_Info_6  : num  NA 0.0018 0.03 0.2 0.05 1 0.8 1 1 0.05 ...
##  $ InsuredInfo_1      : int  1 1 1 2 1 1 1 1 1 2 ...
##  $ InsuredInfo_2      : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ InsuredInfo_3      : int  6 6 8 8 6 8 3 6 3 3 ...
##  $ InsuredInfo_4      : int  3 3 3 3 3 3 3 3 2 3 ...
##  $ InsuredInfo_5      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ InsuredInfo_6      : int  2 2 1 2 2 1 2 1 1 2 ...
##  $ InsuredInfo_7      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Insurance_History_1: int  1 2 2 2 2 2 1 1 1 2 ...
##  $ Insurance_History_2: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Insurance_History_3: int  3 3 1 1 1 3 3 3 3 3 ...
##  $ Insurance_History_4: int  1 1 3 3 3 2 2 1 2 1 ...
##  $ Insurance_History_5: num  0.000667 0.000133 NA NA NA ...
##  $ Insurance_History_7: int  1 1 3 3 3 1 1 1 1 1 ...
##  $ Insurance_History_8: int  1 3 2 2 2 3 1 1 1 3 ...
##  $ Insurance_History_9: int  2 2 3 3 3 2 2 2 2 2 ...
##  $ Family_Hist_1      : int  2 2 3 3 2 2 3 2 3 3 ...
##  $ Family_Hist_2      : num  NA 0.188 0.304 0.42 0.464 ...
##  $ Family_Hist_3      : num  0.598 NA NA NA NA ...
##  $ Family_Hist_4      : num  NA 0.0845 0.2254 0.3521 0.4085 ...
##  $ Family_Hist_5      : num  0.527 NA NA NA NA ...
##  $ Medical_History_1  : int  4 5 10 0 NA 6 5 6 4 NA ...
##  $ Medical_History_2  : int  112 412 3 350 162 491 600 145 16 162 ...
##  $ Medical_History_3  : int  2 2 2 2 2 2 3 2 2 2 ...
##  $ Medical_History_4  : int  1 1 2 2 2 2 2 2 2 2 ...
##  $ Medical_History_5  : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Medical_History_6  : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ Medical_History_7  : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ Medical_History_8  : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ Medical_History_9  : int  1 1 2 2 2 2 1 1 1 2 ...
##  $ Medical_History_10 : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ Medical_History_11 : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ Medical_History_12 : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ Medical_History_13 : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ Medical_History_14 : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ Medical_History_15 : int  240 0 NA NA NA NA NA NA NA NA ...
##  $ Medical_History_16 : int  3 1 1 1 1 1 1 1 1 3 ...
##  $ Medical_History_17 : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ Medical_History_18 : int  1 1 1 1 1 2 1 1 1 1 ...
##  $ Medical_History_19 : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Medical_History_20 : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ Medical_History_21 : int  1 1 1 2 1 2 1 1 1 1 ...
##  $ Medical_History_22 : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ Medical_History_23 : int  3 3 3 3 3 3 3 3 3 1 ...
##  $ Medical_History_24 : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ Medical_History_25 : int  1 1 2 1 2 1 1 1 1 1 ...
##  $ Medical_History_26 : int  3 3 2 3 2 3 3 3 3 3 ...
##  $ Medical_History_27 : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ Medical_History_28 : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Medical_History_29 : int  3 3 3 3 3 3 1 3 1 3 ...
##  $ Medical_History_30 : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ Medical_History_31 : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ Medical_History_32 : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ Medical_History_33 : int  1 3 3 3 3 3 3 3 3 3 ...
##  $ Medical_History_34 : int  3 1 3 3 3 1 3 3 3 3 ...
##  $ Medical_History_35 : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Medical_History_36 : int  2 2 3 2 3 2 2 2 2 2 ...
##  $ Medical_History_37 : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ Medical_History_38 : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Medical_History_39 : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ Medical_History_40 : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ Medical_History_41 : int  3 1 1 1 1 3 3 1 3 1 ...
##  $ Medical_Keyword_1  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_2  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_3  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_4  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_5  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_6  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_7  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_8  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_9  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_10 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_11 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_12 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_13 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_14 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_15 : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ Medical_Keyword_16 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_17 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_18 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_19 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_20 : int  0 0 0 0 0 0 0 0 1 0 ...
##   [list output truncated]
str(test)
## 'data.frame':    19765 obs. of  127 variables:
##  $ Id                 : int  1 3 4 9 12 13 21 28 30 36 ...
##  $ Product_Info_1     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Product_Info_2     : Factor w/ 19 levels "A1","A2","A3",..: 17 2 17 1 1 17 3 18 17 3 ...
##  $ Product_Info_3     : int  26 26 26 26 26 26 26 26 26 26 ...
##  $ Product_Info_4     : num  0.4872 0.0769 0.1447 0.1517 0.0769 ...
##  $ Product_Info_5     : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ Product_Info_6     : int  3 3 3 1 3 3 3 3 3 3 ...
##  $ Product_Info_7     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Ins_Age            : num  0.612 0.627 0.582 0.522 0.299 ...
##  $ Ht                 : num  0.782 0.727 0.709 0.655 0.673 ...
##  $ Wt                 : num  0.339 0.312 0.32 0.268 0.247 ...
##  $ BMI                : num  0.472 0.485 0.519 0.487 0.429 ...
##  $ Employment_Info_1  : num  0.15 0 0.143 0.21 0.085 0.075 0.14 0.025 0.035 0.06 ...
##  $ Employment_Info_2  : int  3 1 9 9 9 9 9 9 9 9 ...
##  $ Employment_Info_3  : int  1 3 1 1 1 1 1 1 1 1 ...
##  $ Employment_Info_4  : num  0 0.07 0 0 0 0 0 0 0 0 ...
##  $ Employment_Info_5  : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ Employment_Info_6  : num  0.5 0.2 0.45 1 0.2 0.4 1 0 NA 1 ...
##  $ InsuredInfo_1      : int  2 1 1 2 1 1 2 1 2 1 ...
##  $ InsuredInfo_2      : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ InsuredInfo_3      : int  11 8 3 3 8 8 3 2 8 8 ...
##  $ InsuredInfo_4      : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ InsuredInfo_5      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ InsuredInfo_6      : int  1 1 1 1 2 1 1 2 1 1 ...
##  $ InsuredInfo_7      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Insurance_History_1: int  2 1 2 1 2 2 2 2 2 2 ...
##  $ Insurance_History_2: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Insurance_History_3: int  1 3 1 3 1 1 3 1 1 1 ...
##  $ Insurance_History_4: int  3 1 3 1 3 3 1 3 3 3 ...
##  $ Insurance_History_5: num  NA 0.001667 NA 0.000667 NA ...
##  $ Insurance_History_7: int  3 1 3 2 3 3 1 3 3 3 ...
##  $ Insurance_History_8: int  2 1 2 1 2 2 3 2 2 2 ...
##  $ Insurance_History_9: int  3 2 3 2 3 3 2 3 3 3 ...
##  $ Family_Hist_1      : int  3 2 3 2 2 3 2 2 3 3 ...
##  $ Family_Hist_2      : num  NA NA 0.667 NA 0.449 ...
##  $ Family_Hist_3      : num  0.627 0.529 NA 0.686 NA ...
##  $ Family_Hist_4      : num  0.761 0.746 0.662 0.676 0.38 ...
##  $ Family_Hist_5      : num  NA NA NA NA NA ...
##  $ Medical_History_1  : int  2 5 3 NA 18 4 21 0 2 NA ...
##  $ Medical_History_2  : int  16 261 132 162 181 335 112 491 112 162 ...
##  $ Medical_History_3  : int  2 3 2 3 3 2 2 2 2 3 ...
##  $ Medical_History_4  : int  2 1 1 2 1 2 1 2 2 2 ...
##  $ Medical_History_5  : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Medical_History_6  : int  3 3 3 1 3 3 3 3 3 3 ...
##  $ Medical_History_7  : int  1 2 2 2 2 2 2 2 2 2 ...
##  $ Medical_History_8  : int  2 2 2 3 2 2 2 2 2 2 ...
##  $ Medical_History_9  : int  2 1 2 2 2 2 1 2 2 2 ...
##  $ Medical_History_10 : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ Medical_History_11 : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ Medical_History_12 : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ Medical_History_13 : int  1 3 3 3 3 3 3 3 3 3 ...
##  $ Medical_History_14 : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ Medical_History_15 : int  NA 110 240 NA 188 NA 82 NA NA NA ...
##  $ Medical_History_16 : int  1 3 1 1 1 1 1 1 1 1 ...
##  $ Medical_History_17 : int  2 3 3 3 3 3 2 3 3 3 ...
##  $ Medical_History_18 : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Medical_History_19 : int  1 1 1 1 1 1 1 2 1 1 ...
##  $ Medical_History_20 : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ Medical_History_21 : int  1 1 1 2 1 1 1 1 1 1 ...
##  $ Medical_History_22 : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ Medical_History_23 : int  1 3 3 3 1 3 3 3 1 3 ...
##  $ Medical_History_24 : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ Medical_History_25 : int  2 2 2 1 1 2 1 1 1 1 ...
##  $ Medical_History_26 : int  2 2 2 3 3 2 3 3 3 3 ...
##  $ Medical_History_27 : int  1 3 3 3 3 3 3 3 3 3 ...
##  $ Medical_History_28 : int  1 1 1 2 1 1 1 1 1 1 ...
##  $ Medical_History_29 : int  3 3 1 3 1 3 3 3 3 3 ...
##  $ Medical_History_30 : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ Medical_History_31 : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ Medical_History_32 : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ Medical_History_33 : int  3 3 1 3 3 3 1 3 3 3 ...
##  $ Medical_History_34 : int  3 3 3 1 3 3 3 3 3 1 ...
##  $ Medical_History_35 : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Medical_History_36 : int  3 3 3 2 2 3 2 2 2 2 ...
##  $ Medical_History_37 : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ Medical_History_38 : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Medical_History_39 : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ Medical_History_40 : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ Medical_History_41 : int  3 1 3 3 3 1 1 3 1 1 ...
##  $ Medical_Keyword_1  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_2  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_3  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_4  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_5  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_6  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_7  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_8  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_9  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_10 : int  0 0 1 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_11 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_12 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_13 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_14 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_15 : int  1 0 0 0 0 0 0 0 1 0 ...
##  $ Medical_Keyword_16 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_17 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_18 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_19 : int  1 0 0 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_20 : int  0 0 0 0 0 0 0 0 0 0 ...
##   [list output truncated]
var_kind<-c("Product_Info_", "Ins_Age", "Ht", "Wt","BMI","Employment_Info_","InsuredInfo_",
                "Insurance_History_", "Family_Hist_","Medical_History_", "Medical_Keyword_")

Data cleaning

Removing variables with excess NAs in both test and train making a function with minimum threshold on train and test data As a preliminary step in data treatment, variables that have a high percentage of missing values are removed. While the threshold for removal is user determined, for this exercise the threshold was 30%.

sapply(train, function(x) sum(is.na(x)) )
##                  Id      Product_Info_1      Product_Info_2 
##                   0                   0                   0 
##      Product_Info_3      Product_Info_4      Product_Info_5 
##                   0                   0                   0 
##      Product_Info_6      Product_Info_7             Ins_Age 
##                   0                   0                   0 
##                  Ht                  Wt                 BMI 
##                   0                   0                   0 
##   Employment_Info_1   Employment_Info_2   Employment_Info_3 
##                  19                   0                   0 
##   Employment_Info_4   Employment_Info_5   Employment_Info_6 
##                6779                   0               10854 
##       InsuredInfo_1       InsuredInfo_2       InsuredInfo_3 
##                   0                   0                   0 
##       InsuredInfo_4       InsuredInfo_5       InsuredInfo_6 
##                   0                   0                   0 
##       InsuredInfo_7 Insurance_History_1 Insurance_History_2 
##                   0                   0                   0 
## Insurance_History_3 Insurance_History_4 Insurance_History_5 
##                   0                   0               25396 
## Insurance_History_7 Insurance_History_8 Insurance_History_9 
##                   0                   0                   0 
##       Family_Hist_1       Family_Hist_2       Family_Hist_3 
##                   0               28656               34241 
##       Family_Hist_4       Family_Hist_5   Medical_History_1 
##               19184               41811                8889 
##   Medical_History_2   Medical_History_3   Medical_History_4 
##                   0                   0                   0 
##   Medical_History_5   Medical_History_6   Medical_History_7 
##                   0                   0                   0 
##   Medical_History_8   Medical_History_9  Medical_History_10 
##                   0                   0               58824 
##  Medical_History_11  Medical_History_12  Medical_History_13 
##                   0                   0                   0 
##  Medical_History_14  Medical_History_15  Medical_History_16 
##                   0               44596                   0 
##  Medical_History_17  Medical_History_18  Medical_History_19 
##                   0                   0                   0 
##  Medical_History_20  Medical_History_21  Medical_History_22 
##                   0                   0                   0 
##  Medical_History_23  Medical_History_24  Medical_History_25 
##                   0               55580                   0 
##  Medical_History_26  Medical_History_27  Medical_History_28 
##                   0                   0                   0 
##  Medical_History_29  Medical_History_30  Medical_History_31 
##                   0                   0                   0 
##  Medical_History_32  Medical_History_33  Medical_History_34 
##               58274                   0                   0 
##  Medical_History_35  Medical_History_36  Medical_History_37 
##                   0                   0                   0 
##  Medical_History_38  Medical_History_39  Medical_History_40 
##                   0                   0                   0 
##  Medical_History_41   Medical_Keyword_1   Medical_Keyword_2 
##                   0                   0                   0 
##   Medical_Keyword_3   Medical_Keyword_4   Medical_Keyword_5 
##                   0                   0                   0 
##   Medical_Keyword_6   Medical_Keyword_7   Medical_Keyword_8 
##                   0                   0                   0 
##   Medical_Keyword_9  Medical_Keyword_10  Medical_Keyword_11 
##                   0                   0                   0 
##  Medical_Keyword_12  Medical_Keyword_13  Medical_Keyword_14 
##                   0                   0                   0 
##  Medical_Keyword_15  Medical_Keyword_16  Medical_Keyword_17 
##                   0                   0                   0 
##  Medical_Keyword_18  Medical_Keyword_19  Medical_Keyword_20 
##                   0                   0                   0 
##  Medical_Keyword_21  Medical_Keyword_22  Medical_Keyword_23 
##                   0                   0                   0 
##  Medical_Keyword_24  Medical_Keyword_25  Medical_Keyword_26 
##                   0                   0                   0 
##  Medical_Keyword_27  Medical_Keyword_28  Medical_Keyword_29 
##                   0                   0                   0 
##  Medical_Keyword_30  Medical_Keyword_31  Medical_Keyword_32 
##                   0                   0                   0 
##  Medical_Keyword_33  Medical_Keyword_34  Medical_Keyword_35 
##                   0                   0                   0 
##  Medical_Keyword_36  Medical_Keyword_37  Medical_Keyword_38 
##                   0                   0                   0 
##  Medical_Keyword_39  Medical_Keyword_40  Medical_Keyword_41 
##                   0                   0                   0 
##  Medical_Keyword_42  Medical_Keyword_43  Medical_Keyword_44 
##                   0                   0                   0 
##  Medical_Keyword_45  Medical_Keyword_46  Medical_Keyword_47 
##                   0                   0                   0 
##  Medical_Keyword_48            Response 
##                   0                   0
sapply(test, function(x) sum(is.na(x)) )
##                  Id      Product_Info_1      Product_Info_2 
##                   0                   0                   0 
##      Product_Info_3      Product_Info_4      Product_Info_5 
##                   0                   0                   0 
##      Product_Info_6      Product_Info_7             Ins_Age 
##                   0                   0                   0 
##                  Ht                  Wt                 BMI 
##                   0                   0                   0 
##   Employment_Info_1   Employment_Info_2   Employment_Info_3 
##                   3                   0                   0 
##   Employment_Info_4   Employment_Info_5   Employment_Info_6 
##                2137                   0                3787 
##       InsuredInfo_1       InsuredInfo_2       InsuredInfo_3 
##                   0                   0                   0 
##       InsuredInfo_4       InsuredInfo_5       InsuredInfo_6 
##                   0                   0                   0 
##       InsuredInfo_7 Insurance_History_1 Insurance_History_2 
##                   0                   0                   0 
## Insurance_History_3 Insurance_History_4 Insurance_History_5 
##                   0                   0                8105 
## Insurance_History_7 Insurance_History_8 Insurance_History_9 
##                   0                   0                   0 
##       Family_Hist_1       Family_Hist_2       Family_Hist_3 
##                   0                9880               11064 
##       Family_Hist_4       Family_Hist_5   Medical_History_1 
##                6677               13624                2972 
##   Medical_History_2   Medical_History_3   Medical_History_4 
##                   0                   0                   0 
##   Medical_History_5   Medical_History_6   Medical_History_7 
##                   0                   0                   0 
##   Medical_History_8   Medical_History_9  Medical_History_10 
##                   0                   0               19564 
##  Medical_History_11  Medical_History_12  Medical_History_13 
##                   0                   0                   0 
##  Medical_History_14  Medical_History_15  Medical_History_16 
##                   0               14864                   0 
##  Medical_History_17  Medical_History_18  Medical_History_19 
##                   0                   0                   0 
##  Medical_History_20  Medical_History_21  Medical_History_22 
##                   0                   0                   0 
##  Medical_History_23  Medical_History_24  Medical_History_25 
##                   0               18585                   0 
##  Medical_History_26  Medical_History_27  Medical_History_28 
##                   0                   0                   0 
##  Medical_History_29  Medical_History_30  Medical_History_31 
##                   0                   0                   0 
##  Medical_History_32  Medical_History_33  Medical_History_34 
##               19414                   0                   0 
##  Medical_History_35  Medical_History_36  Medical_History_37 
##                   0                   0                   0 
##  Medical_History_38  Medical_History_39  Medical_History_40 
##                   0                   0                   0 
##  Medical_History_41   Medical_Keyword_1   Medical_Keyword_2 
##                   0                   0                   0 
##   Medical_Keyword_3   Medical_Keyword_4   Medical_Keyword_5 
##                   0                   0                   0 
##   Medical_Keyword_6   Medical_Keyword_7   Medical_Keyword_8 
##                   0                   0                   0 
##   Medical_Keyword_9  Medical_Keyword_10  Medical_Keyword_11 
##                   0                   0                   0 
##  Medical_Keyword_12  Medical_Keyword_13  Medical_Keyword_14 
##                   0                   0                   0 
##  Medical_Keyword_15  Medical_Keyword_16  Medical_Keyword_17 
##                   0                   0                   0 
##  Medical_Keyword_18  Medical_Keyword_19  Medical_Keyword_20 
##                   0                   0                   0 
##  Medical_Keyword_21  Medical_Keyword_22  Medical_Keyword_23 
##                   0                   0                   0 
##  Medical_Keyword_24  Medical_Keyword_25  Medical_Keyword_26 
##                   0                   0                   0 
##  Medical_Keyword_27  Medical_Keyword_28  Medical_Keyword_29 
##                   0                   0                   0 
##  Medical_Keyword_30  Medical_Keyword_31  Medical_Keyword_32 
##                   0                   0                   0 
##  Medical_Keyword_33  Medical_Keyword_34  Medical_Keyword_35 
##                   0                   0                   0 
##  Medical_Keyword_36  Medical_Keyword_37  Medical_Keyword_38 
##                   0                   0                   0 
##  Medical_Keyword_39  Medical_Keyword_40  Medical_Keyword_41 
##                   0                   0                   0 
##  Medical_Keyword_42  Medical_Keyword_43  Medical_Keyword_44 
##                   0                   0                   0 
##  Medical_Keyword_45  Medical_Keyword_46  Medical_Keyword_47 
##                   0                   0                   0 
##  Medical_Keyword_48 
##                   0
rmNAvars<-function(dat,threshold){
      dat<-dat[, -which(colMeans(is.na(dat)) > threshold)]
    }
    train_clean<-rmNAvars(train,0.3)
    test_clean<-test[,intersect(colnames(test), colnames(train_clean))]

Replacing/Imputing Missing values with Median as Median is not sensitive to outliers For the variables that are not dropped at the previous step of modeling, variables that have missing values in lesser percentages are imputed. The methodology used for imputation is using median of the remaining data series. This is a commonly used industry practice and is efficient as the missing data for all variables is randomly distributed over the response variable.

manage_na <- function(datafra)
    {
      for(i in 1:ncol(datafra))
      {
        if(is.numeric(datafra[,i]))
        {
          datafra[is.na(datafra[,i]),i] <- median(datafra[!is.na(datafra[,i]),i])
        }
      }
      datafra
    }
    train_clean <- manage_na(train_clean)
    test_clean <- manage_na(test)
    train_conti<-train_clean[,c("Product_Info_4", "Ins_Age", "Ht", "Wt", "BMI",
                                "Employment_Info_1", "Employment_Info_4", "Employment_Info_6")]
    
    train_clean[, !(sapply(train_clean, class) == "numeric" | sapply(train_clean, class) == 
                      "integer")]<-
      as.numeric(train_clean[, !(sapply(train_clean, class) == "numeric" | 
                                   sapply(train_clean, class) == "integer")])
    
    test_clean[, !(sapply(test_clean, class) == "numeric" | sapply(test_clean, class) == 
                      "integer")]<-
      as.numeric(test_clean[, !(sapply(test_clean, class) == "numeric" | 
                                   sapply(test_clean, class) == "integer")])

Exploratory Data Analysis

Dividing data into Continuous, categorical and Dummy variables

temp1<- data.frame(Variable_Type = c(
"Product Information",
"Insurance Age",
"Height",
"Weight", 
"BMI",
"Employment Information",
"Insured Information",
"Insurance History",
"Family History",
"Medical History",
"Medical Keyword"))

temp1$Continous<-c(1,1,1,1,1,3,0,1,4,0,0)
temp1$Categorical<-c(6,0,0,0,0,3,7,8,1,41,0)
temp1$Dummy<-c(0,0,0,0,0,0,0,0,0,0,48)
temp1$Total<-rowSums(temp1[,-1])
temp1[12,2:5]<-colSums(temp1[,-1])
temp1$Variable_Type[12]<-"Total"
## Warning in `[<-.factor`(`*tmp*`, 12, value = structure(c(10L, 5L, 4L,
## 11L, : invalid factor level, NA generated
datatable(temp1, options = list(pageLength = 13,
  initComplete = JS(
    "function(settings, json) {",
    "$(this.api().table().header()).css({'background-color': '#000', 'color': '#fff'});",
    "}")))

Continuous variables are analyzed using summary statistics, box plots and density plots. The categorical variables are analyzed using event rate chart to track the variation to the response.

Histogram of Response Plot

The response is a ordinal variable with levels from 1 to 8 and associates to the risk level of a customer

library(ggplot2)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:MASS':
## 
##     select
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
p<-ggplot(train, aes(x=Response))+geom_histogram(fill="Red", alpha=0.3)
ggplotly(p, color=~Response, width = 800, height = 400)%>%
layout(title="Distribution of Response Variable", plot_bgcolor= "white", xaxis=list(gridcolor="lightgrey", opacity=0.5), yaxis=list(gridcolor="lightgrey",opacity = 0.5),autosize = F, width = 800, height = 400)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Specifying width/height in layout() is now deprecated.
## Please specify in ggplotly() or plot_ly()

While it is not mentioned whether the scale is in increasing order of riskiness or otherwise, from the distribution of the response variable we can infer that 8 could possibly refer to the customer which are at high risk to likely take insurance while 1 can be people with low risk to take. insurance.

Summary Statistics

To allow for easier convergence of machine learning algorithms variables are normalized to the range of [0, 1]. The most common normalizing function used is given below:

X=xi−xmin/xmax−xmin

The same function had been applied to the continuous variables in the input data-set. The summary statistics help understand the distribution of the underlying dataset, the box plots and density plots enable visualizing the data-set

## Generating Summary Table

summ_conti<-data.frame(Variables =  colnames(train_conti))
summ_conti$Min<-apply(train_conti,2,function(x){min(x, na.rm = T)})
summ_conti$Max<-apply(train_conti,2,function(x){max(x, na.rm = T)})
summ_conti$Mean<-apply(train_conti,2,function(x){mean(x, na.rm = T)})
summ_conti$Median<-apply(train_conti,2,function(x){median(x, na.rm = T)})
datatable(summ_conti, options = list(initComplete = JS(
    "function(settings, json) {",
    "$(this.api().table().header()).css({'background-color': '#000', 'color': '#fff'});",
    "}")))

Continuous Variable Analysis

The box plots enable visualization of the data-set especially in relation to outliers. However considering the large number of data

Boxplot

library(ggplot2)
library(plotly)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(grid)
train_cont <- data.frame(train_conti, Response=train_clean$Response)
doPlots <- function(data.in, fun, ii, ncol=3) {
  pp <- list()
  for (i in ii) {
    p <- fun(data.in=data.in, i=i)
    pp <- c(pp, list(p))
  }
  do.call("grid.arrange", c(pp, ncol=ncol))
}

plotBox <- function(data.in, i) {
  data <- data.frame(y=data.in[,i], Response=data.in$Response)
  
 p <- ggplot(data, aes(x=factor(Response), y=y)) + geom_boxplot() + ylab(colnames(data.in)[i]) + theme_light()
  return (p)
}

doPlots(data.in=train_cont, fun=plotBox, ii=1:8,ncol=3)

The box plots enable visualization of the data-set especially in relation to outliers as well as Response variable. We can see BMI and Employment_Info_6 show variation with respect to Respinse variable so we can keep them and eliminate all other continuous variables.

train_clean<- subset(train_clean, select = -c(Product_Info_4, Ins_Age, Ht, Wt, Employment_Info_1, Employment_Info_4) )
test_clean<- subset(test_clean, select = -c(Product_Info_4, Ins_Age, Ht, Wt, Employment_Info_1, Employment_Info_4) )

Density Plot

The density plots help visualize the characteristics of the distribution including statistical metrics such as mean, standard deviation and kurtosis. It also enables us to visually identify if any relationship exists with the response variable. For example: The density plot of variable Employment_Info_6 is similar to the histogram of the response variable, this probably indicated that this variable could be a good predictor of the response variable

library(reshape)
## 
## Attaching package: 'reshape'
## The following object is masked from 'package:plotly':
## 
##     rename
## The following objects are masked from 'package:tidyr':
## 
##     expand, smiths
## The following object is masked from 'package:dplyr':
## 
##     rename
temp_melt<-melt(train_conti[,1:2])
## Using  as id variables
   p1<-ggplot(temp_melt,aes(value, fill = variable ))+geom_density(alpha = 0.5)+ggtitle("Density Plots")
    ggplotly(p1, height= 800, width = 1000)%>%
      layout(plot_bgcolor="transparent",paper_bgcolor= "transparent",autosize = F, width = 1000, height = 800)
## Warning: Specifying width/height in layout() is now deprecated.
## Please specify in ggplotly() or plot_ly()
      temp_melt<-melt(train_conti[,c(3,4,5)])
## Using  as id variables
    p2<-ggplot(temp_melt,aes(value, fill = variable ))+geom_density(alpha = 0.5)+ggtitle("Density Plots")
    ggplotly(p2, height= 800, width = 1000)%>%
      layout(plot_bgcolor="transparent",paper_bgcolor= "transparent",autosize = F, width = 1000, height = 800)
## Warning: Specifying width/height in layout() is now deprecated.
## Please specify in ggplotly() or plot_ly()
    temp_melt<-melt(train_conti[,c(6,8)])
## Using  as id variables
    p3<-ggplot(temp_melt,aes(value, fill = variable ))+geom_density(alpha = 0.5)+ggtitle("Density Plots")
    ggplotly(p3, height= 800, width = 1000)%>%
      layout(plot_bgcolor="transparent",paper_bgcolor= "transparent",autosize = F, width = 1000, height = 800)
## Warning: Specifying width/height in layout() is now deprecated.
## Please specify in ggplotly() or plot_ly()
    temp_melt<-melt(train_conti[,7])
    temp_melt$variable<-"Employment_Info_4"
    p4<-ggplot(temp_melt,aes(value, fill = variable ))+geom_density(alpha = 0.5)+ggtitle("Density Plots")
    ggplotly(p4, height= 800, width = 1000)%>%
      layout(plot_bgcolor="transparent",paper_bgcolor= "transparent",autosize = F, width = 1000, height = 800)
## Warning: Specifying width/height in layout() is now deprecated.
## Please specify in ggplotly() or plot_ly()

Event Rate Chart

In an attempt to capture the conditional probability of the response given a specific bin of the categorical variable \[ P(y=1|ProdInfo_2= A_1)=\frac{P(y=1 \cap ProdInfo_2= A_1 )}{P(ProdInfo_2= A_1)} \] 1. Product Information

train_categ<-train_clean[,-which(colnames(train_clean) %in% colnames(train_conti))]
    i="Product_Info"
    train_temp<-train_categ[,grep(i,colnames(train_categ))]
    index<-1
    plt<-htmltools::tagList()
    for (i in colnames(train_temp)){
      data_freq<-as.data.frame(table(train_temp[,i],train_clean$Response)/(as.data.frame(table(train_temp[,i]))[,2]))
      p<-plot_ly(data_freq, x = ~Var1, y = ~Freq, color = ~Var2, type="bar")%>%
        layout(title = paste0("Event Rate Chart- ",gsub("_"," ",i)),
               xaxis = list(title = gsub("_"," ",i),showgrid = T))
      plt[[index]] <- as_widget(p)
      index <- index + 1
    }
    plt

2.Employment Information

i="Employment_Info"
    train_temp<-train_categ[,grep(i,colnames(train_categ))]
    index<-1
    plt<-htmltools::tagList()
    for (i in colnames(train_temp)){
      data_freq<-as.data.frame(table(train_temp[,i],train_clean$Response)/(as.data.frame(table(train_temp[,i]))[,2]))
      p<-plot_ly(data_freq, x = ~Var1, y = ~Freq, color = ~Var2, type="bar")%>%
        layout(title = paste0("Event Rate Chart- ",gsub("_"," ",i)),
               xaxis = list(title = gsub("_"," ",i),showgrid = T))
      plt[[index]] <- as_widget(p)
      index <- index + 1
    }
    plt
  1. Insured Information
i="InsuredInfo"
    train_temp<-train_categ[,grep(i,colnames(train_categ))]
    index<-1
    plt<-htmltools::tagList()
    for (i in colnames(train_temp)){
      data_freq<-as.data.frame(table(train_temp[,i],train_clean$Response)/(as.data.frame(table(train_temp[,i]))[,2]))
      p<-plot_ly(data_freq, x = ~Var1, y = ~Freq, color = ~Var2, type="bar")%>%
        layout(title = paste0("Event Rate Chart- ",gsub("_"," ",i)),
               xaxis = list(title = gsub("_"," ",i),showgrid = T))
      plt[[index]] <- as_widget(p)
      index <- index + 1
    }
    plt

4.Insurance History

i="Insurance_History"
    train_temp<-train_categ[,grep(i,colnames(train_categ))]
    index<-1
    plt<-htmltools::tagList()
    for (i in colnames(train_temp)){
      data_freq<-as.data.frame(table(train_temp[,i],train_clean$Response)/(as.data.frame(table(train_temp[,i]))[,2]))
      p<-plot_ly(data_freq, x = ~Var1, y = ~Freq, color = ~Var2, type="bar")%>%
        layout(title = paste0("Event Rate Chart- ",gsub("_"," ",i)),
               xaxis = list(title = gsub("_"," ",i),showgrid = T))
      plt[[index]] <- as_widget(p)
      index <- index + 1
    }
    plt

5.Medical History

 par(mfrow=c(2,2))  
  i="Medical_History"
    train_temp<-train_categ[,grep(i,colnames(train_categ))]
    index<-1
    plt<-htmltools::tagList()
    for (i in colnames(train_temp)){
      data_freq<-as.data.frame(table(train_temp[,i],train_clean$Response)/(as.data.frame(table(train_temp[,i]))[,2]))
      p<-plot_ly(data_freq, x = ~Var1, y = ~Freq, color = ~Var2, type="bar")%>%
        layout(title = paste0("Event Rate Chart- ",gsub("_"," ",i)),
               xaxis = list(title = gsub("_"," ",i),showgrid = T))
      plt[[index]] <- as_widget(p)
      index <- index + 1
    }
    plt

After looking at variables values, we see that columns Medical_Keyword_1 to Medical_Keyword_48 all have only zeros and ones, which may not have much predictive power, but adding them together might be significant . Below steps create new column and removes the columns from which it was created

# Creating a new column as a sum of all these column : MedKeywordSum
train_clean$MedKeywordSum <- rowSums(train_clean[,c(64:112)])
test_clean$MedKeywordSum <- rowSums(test_clean[,c(73:121)])
# Dropping Medical_Keyword_1 to Medical_Keyword_48 from dataset
train_clean <- subset(train_clean, select = -c(68:112) )
test_clean <- subset(test_clean, select = -c(73:121) )
# Creating a new column as a sum of all these column : MedHistSum
train_clean$MedHistSum <- rowSums(train_clean[,c(28:68)])
test_clean$MedHistSum <- rowSums(test_clean[,c(33:73)])
# Dropping Medical_hist_1 to Medical_hist_48 from dataset
train_clean <- subset(train_clean, select = -c(28:66) )
test_clean <- subset(test_clean, select = -c(33:72) )
#these were eliminated in training data due to alot of NA rows
test_clean<-subset(test_clean,select = -c(29:32))
train_clean<-subset(train_clean,select = -28)

Making a predictive model

A predictive model is built to predict response value using ** Multinomial Logistic Regression**. Below are the steps executed.

Preparing “test” dataset to contain same column as “train_clean” data set to use in predictive models.

Creating a Multinomila logistic regression model to predict Response.

library(caret)
## Warning: package 'caret' was built under R version 3.4.4
## Loading required package: lattice
## Warning in as.POSIXlt.POSIXct(Sys.time()): unknown timezone 'zone/tz/2018e.
## 1.0/zoneinfo/America/New_York'
library(nnet)
MultinomModel <- multinom(Response ~ ., data = train_clean)
## # weights:  248 (210 variable)
## initial  value 123479.318186 
## iter  10 value 109795.691165
## iter  20 value 106631.615491
## iter  30 value 103834.040782
## iter  40 value 100719.001427
## iter  50 value 98588.294990
## iter  60 value 98164.596527
## iter  70 value 97861.844228
## iter  80 value 97183.867580
## iter  90 value 95876.601964
## iter 100 value 94419.643160
## final  value 94419.643160 
## stopped after 100 iterations
predict_Response <- predict (MultinomModel, test_clean , "probs")

test_clean$Response <- predict (MultinomModel, test_clean)

Write Submission File

submission <- test_clean[, c(1,31)]
write.csv(submission, "~/Desktop/Programming for analytics/week 6/Prudential/submission.csv", row.names = F)

Summary

Accuracy obtained from Kaggle Kappa is 0.36174 for this model. There is a lot of scope for improvement. Overall we can see that age, BMI, Family history and product type as well are main factors to assess risk of the insurance.